pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  mice,
  VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl  (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm  (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")

summary(flights_dt)
    dep_time                   sched_dep_time                  dep_delay          origin          origin_name       
 Min.   :2013-01-01 05:17:00   Min.   :2013-01-01 05:15:00   Min.   : -43.00   Length:327346      Length:327346     
 1st Qu.:2013-04-05 05:56:00   1st Qu.:2013-04-05 06:00:00   1st Qu.:  -5.00   Class :character   Class :character  
 Median :2013-07-04 09:52:30   Median :2013-07-04 09:54:30   Median :  -2.00   Mode  :character   Mode  :character  
 Mean   :2013-07-03 18:10:05   Mean   :2013-07-03 18:02:48   Mean   :  12.56                                        
 3rd Qu.:2013-10-01 18:12:45   3rd Qu.:2013-10-01 18:14:45   3rd Qu.:  11.00                                        
 Max.   :2013-12-31 23:56:00   Max.   :2013-12-31 23:59:00   Max.   :1301.00                                        
                                                                                                                    
    arr_time                   sched_arr_time                  arr_delay            dest          
 Min.   :2013-01-01 00:03:00   Min.   :2013-01-01 00:05:00   Min.   : -86.000   Length:327346     
 1st Qu.:2013-04-05 01:03:00   1st Qu.:2013-04-05 03:43:30   1st Qu.: -17.000   Class :character  
 Median :2013-07-04 11:36:30   Median :2013-07-04 11:55:30   Median :  -5.000   Mode  :character  
 Mean   :2013-07-03 19:41:02   Mean   :2013-07-03 19:59:24   Mean   :   6.895                     
 3rd Qu.:2013-10-01 20:07:00   3rd Qu.:2013-10-01 20:20:30   3rd Qu.:  14.000                     
 Max.   :2014-01-01 00:00:00   Max.   :2013-12-31 23:59:00   Max.   :1272.000                     
                                                                                                  
  dest_name            air_time        distance        flight       carrier          carrier_name      
 Length:327346      Min.   : 20.0   Min.   :  80   Min.   :   1   Length:327346      Length:327346     
 Class :character   1st Qu.: 82.0   1st Qu.: 509   1st Qu.: 544   Class :character   Class :character  
 Mode  :character   Median :129.0   Median : 888   Median :1467   Mode  :character   Mode  :character  
                    Mean   :150.7   Mean   :1048   Mean   :1943                                        
                    3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:3412                                        
                    Max.   :695.0   Max.   :4983   Max.   :8500                                        
                                                                                                       
   tailnum          manufacturer          model              engines          seats          engine         
 Length:327346      Length:327346      Length:327346      Min.   :1.00    Min.   :  2.0   Length:327346     
 Class :character   Class :character   Class :character   1st Qu.:2.00    1st Qu.: 55.0   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Median :2.00    Median :149.0   Mode  :character  
                                                          Mean   :1.99    Mean   :137.5                     
                                                          3rd Qu.:2.00    3rd Qu.:189.0                     
                                                          Max.   :4.00    Max.   :450.0                     
                                                          NA's   :48329   NA's   :48329                     
  aircraft_age        lat             lon               alt           time_hour                      wind_dir    
 Min.   : 8.00   Min.   :21.32   Min.   :-157.92   Min.   :   3.0   Min.   :2013-01-01 10:00:00   Min.   :  0.0  
 1st Qu.:15.00   1st Qu.:32.90   1st Qu.: -95.34   1st Qu.:  26.0   1st Qu.:2013-04-05 10:00:00   1st Qu.:130.0  
 Median :19.00   Median :36.08   Median : -83.99   Median : 433.0   Median :2013-07-04 13:00:00   Median :220.0  
 Mean   :19.59   Mean   :35.97   Mean   : -89.61   Mean   : 583.8   Mean   :2013-07-03 21:56:45   Mean   :201.9  
 3rd Qu.:22.00   3rd Qu.:41.41   3rd Qu.: -80.15   3rd Qu.: 748.0   3rd Qu.:2013-10-01 22:00:00   3rd Qu.:290.0  
 Max.   :65.00   Max.   :61.17   Max.   : -68.83   Max.   :6602.0   Max.   :2014-01-01 04:00:00   Max.   :360.0  
 NA's   :53493   NA's   :7537    NA's   :7537      NA's   :7537                                   NA's   :9574   
     humid             hour           minute           year           type                temp       
 Min.   : 12.74   Min.   : 5.00   Min.   : 0.00   Min.   :1956    Length:327346      Min.   : 10.94  
 1st Qu.: 43.74   1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:1999    Class :character   1st Qu.: 42.08  
 Median : 57.22   Median :13.00   Median :29.00   Median :2002    Mode  :character   Median : 57.20  
 Mean   : 59.21   Mean   :13.14   Mean   :26.23   Mean   :2001                       Mean   : 57.01  
 3rd Qu.: 74.67   3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2006                       3rd Qu.: 71.96  
 Max.   :100.00   Max.   :23.00   Max.   :59.00   Max.   :2013                       Max.   :100.04  
 NA's   :1544                                     NA's   :53493                      NA's   :1544    
      dewp         wind_speed         precip          pressure          visib      
 Min.   :-9.94   Min.   : 0.000   Min.   :0.0000   Min.   : 983.8   Min.   : 0.00  
 1st Qu.:26.06   1st Qu.: 6.905   1st Qu.:0.0000   1st Qu.:1012.9   1st Qu.:10.00  
 Median :42.80   Median :10.357   Median :0.0000   Median :1017.6   Median :10.00  
 Mean   :41.50   Mean   :11.060   Mean   :0.0042   Mean   :1017.9   Mean   : 9.29  
 3rd Qu.:57.92   3rd Qu.:14.960   3rd Qu.:0.0000   3rd Qu.:1022.9   3rd Qu.:10.00  
 Max.   :78.08   Max.   :42.579   Max.   :1.2100   Max.   :1042.1   Max.   :10.00  
 NA's   :1544    NA's   :1605     NA's   :1527     NA's   :36142    NA's   :1527   

Which airport in our dataset has the highest number of departures?

What is the trend of delays over the year, how does it compare across the three airports?

flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean departure delay (minutes)",
    title = " Mean delay by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

Which months have the highest and lowest mean departure delays?

# highest mean departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(desc(mean_delay))
# lowest departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(mean_delay)

What is the number of delayed flights over the year?

flights_dt %>%
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = "Number of delayed flights by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

Which month has the highest number of delays?

flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  arrange(desc(no_of_delays))

What were the weather conditions during these months?

flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    mean_pressure = mean(pressure, na.rm = TRUE),
    mean_precip = mean(precip, na.rm = TRUE),
    mean_temp = mean(temp, na.rm = TRUE),
    mean_wind_speed = mean(wind_speed, na.rm = TRUE),
    mean_wind_dir = mean(wind_dir, na.rm = TRUE),
    mean_dewp = mean(dewp, na.rm = TRUE),
    mean_visib = mean(visib, na.rm = TRUE),
    mean_humid = mean(humid, na.rm = TRUE)
  ) %>% 
  pivot_longer(
    cols = mean_pressure:mean_humid,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_line() +
  scale_colour_manual(values = cbPalette) +
  theme_bw()
Warning: Removed 8 row(s) containing missing values (geom_path).

flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = mean(dep_delay),
    pressure = mean(pressure),
    precipitation = mean(precip),
    temperature = mean(temp),
    wind_speed = mean(wind_speed),
    wind_direction = mean(wind_dir),
    dewpoint = mean(dewp),
    visibility = mean(visib),
    humidity = mean(humid)
  ) %>% 
  na.omit() %>% 
  kable(align = "c") %>%
  kable_styling(full_width = FALSE) %>% 
  column_spec(1:10, color = "black")
date departure_delay pressure precipitation temperature wind_speed wind_direction dewpoint visibility humidity
2013-12-02 60.08537 1013.267 0.0000000 46.32317 1.684068 11.58537 33.83073 7.402439 62.29524
2013-12-03 46.37931 1013.555 0.0000000 48.27759 5.297556 176.55172 33.99241 8.275862 59.37724
2013-12-04 58.64286 1019.479 0.0000000 47.24857 3.074227 39.14286 39.51371 6.314286 74.70871
2013-12-11 50.72414 1024.549 0.0000000 29.61655 12.248532 234.48276 16.42483 10.000000 57.65954
2013-12-12 41.20000 1024.931 0.0000000 24.20825 11.234490 251.25000 6.60200 10.000000 46.63287
2013-12-13 48.45902 1023.211 0.0000000 31.33311 10.772055 247.21311 14.74656 10.000000 49.98475
2013-12-16 43.13000 1020.163 0.0000000 27.44420 11.116535 279.50000 9.73220 9.950000 46.96260
2013-12-18 41.73529 1019.113 0.0000000 31.42294 9.950862 252.54902 19.94529 9.950980 62.86186
2013-12-19 53.08904 1019.290 0.0000000 38.14096 5.777546 199.93151 26.13644 9.739726 62.29753
2013-12-20 56.01198 1016.354 0.0000000 45.38790 7.194098 207.42515 32.40527 9.952096 60.93689
2013-12-21 56.14516 1014.031 0.0004032 57.21452 7.081009 195.16129 48.57452 9.943548 73.58613
2013-12-24 41.00000 1021.422 0.0000000 36.45655 12.599057 317.75862 20.25966 10.000000 52.09621
2013-12-27 49.68468 1026.695 0.0000000 37.00757 8.159134 248.01802 19.18919 10.000000 48.95180
2013-12-28 36.03896 1020.871 0.0000000 45.39247 12.688470 228.83117 25.24182 10.000000 47.41364
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = scale(dep_delay),
    pressure = scale(pressure),
    precipitation = scale(precip),
    temperature = scale(temp),
    wind_speed = scale(wind_speed),
    wind_direction = scale(wind_dir),
    dewpoint = scale(dewp),
    visibility = scale(visib),
    humidity = scale(humid)
  ) %>% 
  na.omit() %>% 
  pivot_longer(
    cols = departure_delay:humidity,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_point(alpha = 0.8) +
  scale_colour_manual(values = cbPalette) +
  scale_x_date(labels = date_format("%d"), breaks = date_breaks("day")) +
  labs(
    colour = "",
    title = "Weather variables December"
  ) +
  theme_bw()
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

What is the number of flights per month?

flights_dt %>%
  mutate(month = month(sched_dep_time, label = TRUE)) %>% 
  group_by(month, origin) %>% 
  summarise(no_of_flights = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_flights, fill = origin) +
  geom_col(position = "dodge", alpha = 0.8) +
  labs(
    x = "month",
    y = "number of departures",
    title = "Departure numbers by month",
    fill = "airport"
  ) +
  scale_fill_manual(values = cbPalette) +
  theme_bw()

Which carrier has the most departure delays?

flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()

Which carrier has the most arrival delays?

flights_dt %>% 
  filter(arr_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()

Which season has the highest number of departure delays?

flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    title = "Mean departure delay by month"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()

Over the course of a day when is the largest average delay?

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  select(dep_delay, hour) %>%
  group_by(hour) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8, colour = "#56B4E9") +
  geom_smooth(se = FALSE, colour = "#E69F00") +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

How does distance of an outbound flight affect departure delays?

flights_dt %>% 
  filter(origin == "EWR") %>% 
  ggplot() +
  aes(x = distance) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(title = "Distribution of flight distance")

positions <- c("<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi", "2000-2500mi", "2500-3000mi",
               ">3000mi")

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(distance = case_when(
    distance <= 500 ~ "<500mi",
    distance > 500 & distance <= 1000 ~ "500-1000mi",
    distance > 1000 & distance <= 1500 ~ "1000-1500mi",
    distance > 1500 & distance <= 2000 ~ "1500-2000mi",
    distance > 2000 & distance <= 2500 ~ "2000-2500mi",
    distance > 2500 & distance <= 3000 ~ "2500-3000mi",
    distance > 3000 ~ ">3000mi"
  )) %>% 
  group_by(distance) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = distance, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "distance (miles)",
    y = "mean departure delay (minutes)",
    title = "Departure delay by flight distance"
  ) +
  scale_x_discrete(limits = positions) +
  theme_bw()

Which day is the best day to travel from Newark Int.?

flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(weekday = wday(sched_dep_time, label = TRUE)) %>% 
  group_by(weekday) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = weekday, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "weekday",
    y = "mean departure delay (minutes)",
    title = "Departure delay by weekday"
  ) +
  theme_bw()

Which destinations suffer from the longest delays?

flights_dt %>%
  drop_na(dest_name) %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(mean_delay)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "mean delay (minutes)",
    title = "Destinations with longest delays"
  ) +
  theme_bw() +
  coord_flip()

Trend weather conditions against mean departure delays

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_visibility = mean(visib, na.rm = TRUE),
            mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = mean_visibility, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by visibility"
  ) +
  theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_humidity = mean(humid, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean temperature (degf)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_precip = mean(precip, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_pressure = mean(pressure, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

---
title: "Exploratory Analysis"
output: html_notebook
---

```{r, warning=FALSE,message=FALSE}
pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  leaps,
  glmulti,
  broom,
  rpart,
  rpart.plot,
  modelr,
  yardstick,
  caret,
  ranger,
  kableExtra
)
```

```{r, warning=FALSE,message=FALSE}
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")

summary(flights_dt)
```

# Which airport in our dataset has the highest number of departures?

```{r}
flights_dt %>% 
  group_by(origin) %>%
  ggplot() +
  aes(x = origin, fill = origin) +
  geom_bar(alpha = 0.8) +
  scale_fill_manual(values = cbPalette) +
  labs(
    x = "airport",
    y = "departure numbers",
    title = "Departure numbers by airport"
  ) +
  guides(fill = "none") +
  theme_bw()
```

# What is the trend of delays over the year, how does it compare across the three airports?

```{r,warning=FALSE}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean departure delay (minutes)",
    title = " Mean delay by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```
# Which months have the highest and lowest mean departure delays?

```{r}
# highest mean departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(desc(mean_delay))
```

```{r}
# lowest departure delay by month
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  arrange(mean_delay)
```

# What is the number of delayed flights over the year?

```{r}
flights_dt %>%
  filter(dep_delay >= 15) %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = "Number of delayed flights by month",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```
# Which month has the highest number of delays?

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>% 
  summarise(no_of_delays = n(), .groups = "drop") %>% 
  arrange(desc(no_of_delays))
```

# What were the weather conditions during these months?

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    mean_pressure = mean(pressure, na.rm = TRUE),
    mean_precip = mean(precip, na.rm = TRUE),
    mean_temp = mean(temp, na.rm = TRUE),
    mean_wind_speed = mean(wind_speed, na.rm = TRUE),
    mean_wind_dir = mean(wind_dir, na.rm = TRUE),
    mean_dewp = mean(dewp, na.rm = TRUE),
    mean_visib = mean(visib, na.rm = TRUE),
    mean_humid = mean(humid, na.rm = TRUE)
  ) %>% 
  pivot_longer(
    cols = mean_pressure:mean_humid,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_line() +
  scale_colour_manual(values = cbPalette) +
  theme_bw()
```

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = mean(dep_delay),
    pressure = mean(pressure),
    precipitation = mean(precip),
    temperature = mean(temp),
    wind_speed = mean(wind_speed),
    wind_direction = mean(wind_dir),
    dewpoint = mean(dewp),
    visibility = mean(visib),
    humidity = mean(humid)
  ) %>% 
  na.omit() %>% 
  kable(align = "c") %>%
  kable_styling(full_width = FALSE) %>% 
  column_spec(1:10, color = "black")
```  

```{r}
flights_dt %>% 
  mutate(date = as.Date(dep_time, label = TRUE),
         month = month(dep_time, label = TRUE)) %>% 
  filter(dep_delay >= 15 & origin == "EWR" & month == "Dec") %>%
  select(date, dep_delay, month, pressure, precip, temp, wind_speed, wind_dir, dewp, visib, humid) %>% 
  group_by(date) %>% 
  summarise(
    departure_delay = scale(dep_delay),
    pressure = scale(pressure),
    precipitation = scale(precip),
    temperature = scale(temp),
    wind_speed = scale(wind_speed),
    wind_direction = scale(wind_dir),
    dewpoint = scale(dewp),
    visibility = scale(visib),
    humidity = scale(humid)
  ) %>% 
  na.omit() %>% 
  pivot_longer(
    cols = departure_delay:humidity,
    names_to = "variable",
    values_to = "value"
  ) %>% 
  ggplot() +
  aes(x = date, y = value, colour = variable) +
  geom_point(alpha = 0.8) +
  scale_colour_manual(values = cbPalette) +
  scale_x_date(labels = date_format("%d"), breaks = date_breaks("day")) +
  labs(
    colour = "",
    title = "Weather variables December"
  ) +
  theme_bw()
```

# What is the number of flights per month?

```{r}
flights_dt %>%
  mutate(month = month(sched_dep_time, label = TRUE)) %>% 
  group_by(month, origin) %>% 
  summarise(no_of_flights = n(), .groups = "drop") %>% 
  ggplot() +
  aes(x = month, y = no_of_flights, fill = origin) +
  geom_col(position = "dodge", alpha = 0.8) +
  labs(
    x = "month",
    y = "number of departures",
    title = "Departure numbers by month",
    fill = "airport"
  ) +
  scale_fill_manual(values = cbPalette) +
  theme_bw()
```

# Which carrier has the most departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which carrier has the most arrival delays?

```{r}
flights_dt %>% 
  filter(arr_delay >= 15) %>% 
  group_by(carrier_name, origin) %>% 
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
  coord_flip()
```

# Which season has the highest number of departure delays?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15) %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    title = "Mean departure delay by month"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# Over the course of a day when is the largest average delay?

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  select(dep_delay, hour) %>%
  group_by(hour) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8, colour = "#56B4E9") +
  geom_smooth(se = FALSE, colour = "#E69F00") +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
```
# How does distance of an outbound flight affect departure delays?

```{r}
flights_dt %>% 
  filter(origin == "EWR") %>% 
  ggplot() +
  aes(x = distance) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(title = "Distribution of flight distance")
```

```{r}
positions <- c("<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi", "2000-2500mi", "2500-3000mi",
               ">3000mi")

flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(distance = case_when(
    distance <= 500 ~ "<500mi",
    distance > 500 & distance <= 1000 ~ "500-1000mi",
    distance > 1000 & distance <= 1500 ~ "1000-1500mi",
    distance > 1500 & distance <= 2000 ~ "1500-2000mi",
    distance > 2000 & distance <= 2500 ~ "2000-2500mi",
    distance > 2500 & distance <= 3000 ~ "2500-3000mi",
    distance > 3000 ~ ">3000mi"
  )) %>% 
  group_by(distance) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = distance, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "distance (miles)",
    y = "mean departure delay (minutes)",
    title = "Departure delay by flight distance"
  ) +
  scale_x_discrete(limits = positions) +
  theme_bw()
```

# Which day is the best day to travel from Newark Int.?

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  mutate(weekday = wday(sched_dep_time, label = TRUE)) %>% 
  group_by(weekday) %>% 
  summarise(mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = weekday, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "weekday",
    y = "mean departure delay (minutes)",
    title = "Departure delay by weekday"
  ) +
  theme_bw()
```

# Which are the most popular destinations from Newark Int?

```{r}
flights_dt %>% 
  filter(origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(count = n(), .groups = "drop") %>% 
  arrange(desc(count)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, count), y = count) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "number of flights per year",
    title = "Destinations from Newark Int."
  ) +
  theme_bw() +
  coord_flip()
```

# Which destinations suffer from the longest delays?

```{r}
flights_dt %>%
  drop_na(dest_name) %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(dest, dest_name) %>% 
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>% 
  arrange(desc(mean_delay)) %>% 
  head(10) %>% 
  ggplot() +
  aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "destination",
    y = "mean delay (minutes)",
    title = "Destinations with longest delays"
  ) +
  theme_bw() +
  coord_flip()
```

# Trend weather conditions against mean departure delays

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_visibility = mean(visib, na.rm = TRUE),
            mean_delay = mean(dep_delay), .groups = "drop") %>% 
  ggplot() +
  aes(x = mean_visibility, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by visibility"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_humidity = mean(humid, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_temp = mean(temp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean temperature (degf)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
```

```{r}
flights_dt %>% 
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_precip = mean(precip, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
```

```{r}
flights_dt %>%
  filter(dep_delay >= 15 & origin == "EWR") %>% 
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>% 
  summarise(mean_pressure = mean(pressure, na.rm = TRUE),
            mean_delay = mean(dep_delay)) %>% 
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
```



